About Data Analysis Report

This R Markdown file contains the report of the data analysis performed for the project on forecasting daily bike rental demand using time series models in R. It covers data exploration, summary statistics, and the building of time series models. The final report was completed on Wed Apr 9 18:45:11 2025.

Data Description: This dataset contains the daily count of rental bike transactions between years 2011 and 2012 in Capital bikeshare system with the corresponding weather and seasonal information.

Data Source: https://archive.ics.uci.edu/ml/datasets/bike+sharing+dataset

Relevant Paper: Fanaee-T, Hadi, and Gama, Joao, ‘Event labeling combining ensemble detectors and background knowledge’, Progress in Artificial Intelligence (2013): pp. 1-15, Springer Berlin Heidelberg

Task One: Load and explore the data

Load data and install packages

# Install and load required packages.
# Only install packages that are not already present: unconditionally
# calling install.packages() on every knit is slow, requires a configured
# CRAN mirror, and needlessly reinstalls existing libraries.
pkgs <- c("timetk", "tidyverse", "lubridate", "forecast", "TTR", "tseries")
to_install <- pkgs[!vapply(pkgs, requireNamespace, logical(1), quietly = TRUE)]
if (length(to_install) > 0) {
  install.packages(to_install)
}

library(timetk)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.2     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.2     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(lubridate)
library(forecast)
## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo
library(TTR)
library(tseries)

# Load the built-in Capital Bikeshare daily dataset shipped with timetk
data("bike_sharing_daily")
bike_data <- bike_sharing_daily
head(bike_data)
## # A tibble: 6 × 16
##   instant dteday     season    yr  mnth holiday weekday workingday weathersit
##     <dbl> <date>      <dbl> <dbl> <dbl>   <dbl>   <dbl>      <dbl>      <dbl>
## 1       1 2011-01-01      1     0     1       0       6          0          2
## 2       2 2011-01-02      1     0     1       0       0          0          2
## 3       3 2011-01-03      1     0     1       0       1          1          1
## 4       4 2011-01-04      1     0     1       0       2          1          1
## 5       5 2011-01-05      1     0     1       0       3          1          1
## 6       6 2011-01-06      1     0     1       0       4          1          1
## # ℹ 7 more variables: temp <dbl>, atemp <dbl>, hum <dbl>, windspeed <dbl>,
## #   casual <dbl>, registered <dbl>, cnt <dbl>
# Derive the working date column from the dataset's own dteday field.
# (Rebuilding dates positionally as "2011-01-01" + row index silently
# yields wrong dates if any rows were missing or reordered; dteday is
# the authoritative date already present in the data.)
bike_data <- bike_data %>%
  mutate(date = as_date(dteday))

Describe and explore the data

# Summary statistics for every column.
# Note: temp, atemp, hum and windspeed are normalized — the printed
# ranges all lie within [0, 1].
summary(bike_data)
##     instant          dteday               season            yr        
##  Min.   :  1.0   Min.   :2011-01-01   Min.   :1.000   Min.   :0.0000  
##  1st Qu.:183.5   1st Qu.:2011-07-02   1st Qu.:2.000   1st Qu.:0.0000  
##  Median :366.0   Median :2012-01-01   Median :3.000   Median :1.0000  
##  Mean   :366.0   Mean   :2012-01-01   Mean   :2.497   Mean   :0.5007  
##  3rd Qu.:548.5   3rd Qu.:2012-07-01   3rd Qu.:3.000   3rd Qu.:1.0000  
##  Max.   :731.0   Max.   :2012-12-31   Max.   :4.000   Max.   :1.0000  
##       mnth          holiday           weekday        workingday   
##  Min.   : 1.00   Min.   :0.00000   Min.   :0.000   Min.   :0.000  
##  1st Qu.: 4.00   1st Qu.:0.00000   1st Qu.:1.000   1st Qu.:0.000  
##  Median : 7.00   Median :0.00000   Median :3.000   Median :1.000  
##  Mean   : 6.52   Mean   :0.02873   Mean   :2.997   Mean   :0.684  
##  3rd Qu.:10.00   3rd Qu.:0.00000   3rd Qu.:5.000   3rd Qu.:1.000  
##  Max.   :12.00   Max.   :1.00000   Max.   :6.000   Max.   :1.000  
##    weathersit         temp             atemp              hum        
##  Min.   :1.000   Min.   :0.05913   Min.   :0.07907   Min.   :0.0000  
##  1st Qu.:1.000   1st Qu.:0.33708   1st Qu.:0.33784   1st Qu.:0.5200  
##  Median :1.000   Median :0.49833   Median :0.48673   Median :0.6267  
##  Mean   :1.395   Mean   :0.49538   Mean   :0.47435   Mean   :0.6279  
##  3rd Qu.:2.000   3rd Qu.:0.65542   3rd Qu.:0.60860   3rd Qu.:0.7302  
##  Max.   :3.000   Max.   :0.86167   Max.   :0.84090   Max.   :0.9725  
##    windspeed           casual         registered        cnt      
##  Min.   :0.02239   Min.   :   2.0   Min.   :  20   Min.   :  22  
##  1st Qu.:0.13495   1st Qu.: 315.5   1st Qu.:2497   1st Qu.:3152  
##  Median :0.18097   Median : 713.0   Median :3662   Median :4548  
##  Mean   :0.19049   Mean   : 848.2   Mean   :3656   Mean   :4504  
##  3rd Qu.:0.23321   3rd Qu.:1096.0   3rd Qu.:4776   3rd Qu.:5956  
##  Max.   :0.50746   Max.   :3410.0   Max.   :6946   Max.   :8714  
##       date           
##  Min.   :2011-01-01  
##  1st Qu.:2011-07-02  
##  Median :2012-01-01  
##  Mean   :2012-01-01  
##  3rd Qu.:2012-07-01  
##  Max.   :2012-12-31
# Correlation between temperature and rentals — both actual (temp) and
# "feels like" (atemp) temperature correlate moderately with demand
cor(bike_data$temp, bike_data$cnt)
## [1] 0.627494
cor(bike_data$atemp, bike_data$cnt)
## [1] 0.6310657
# Mean and median (normalized) temperature per season
bike_data %>%
  group_by(season) %>%
  summarise(mean_temp = mean(temp), median_temp = median(temp))
## # A tibble: 4 × 3
##   season mean_temp median_temp
##    <dbl>     <dbl>       <dbl>
## 1      1     0.298       0.286
## 2      2     0.544       0.562
## 3      3     0.706       0.715
## 4      4     0.423       0.409
# Monthly averages of weather variables and rentals.
# The rentals column is a mean of cnt, so it is named mean_rentals
# (previously mislabeled total_rentals despite being computed with mean()).
bike_data %>%
  mutate(month = month(date)) %>%
  group_by(month) %>%
  summarise(mean_temp = mean(temp), mean_hum = mean(hum), mean_wind = mean(windspeed), mean_rentals = mean(cnt))
## # A tibble: 12 × 5
##    month mean_temp mean_hum mean_wind mean_rentals
##    <dbl>     <dbl>    <dbl>     <dbl>        <dbl>
##  1     1     0.236    0.586     0.206        2176.
##  2     2     0.299    0.567     0.216        2655.
##  3     3     0.391    0.588     0.223        3692.
##  4     4     0.470    0.588     0.234        4485.
##  5     5     0.595    0.689     0.183        5350.
##  6     6     0.684    0.576     0.185        5772.
##  7     7     0.755    0.598     0.166        5564.
##  8     8     0.709    0.638     0.173        5664.
##  9     9     0.616    0.715     0.166        5767.
## 10    10     0.485    0.694     0.175        5199.
## 11    11     0.369    0.625     0.184        4247.
## 12    12     0.324    0.666     0.177        3404.
# Boxplot of temperature distribution within each season
boxplot(temp ~ season, data = bike_data, main = "Temperature by Season")

Task Two: Create interactive time series plots

# Interactive time series plot of daily rental counts (timetk + plotly
# range slider for zooming)
plot_time_series(bike_data, date, cnt, .interactive = TRUE, .plotly_slider = TRUE)

# Seasonal diagnostics of cnt across the calendar periods timetk detects
plot_seasonal_diagnostics(bike_data, date, cnt)

# Anomaly diagnostics: highlights days whose cnt deviates from the
# detected trend/seasonal structure
plot_anomaly_diagnostics(bike_data, date, cnt)
## frequency = 7 observations per 1 week
## trend = 92 observations per 3 months

Task Three: Smooth time series data

# Convert the daily counts to a ts object with a yearly seasonal period
# (frequency = 365). bike_ts and bike_clean are reused by later chunks,
# so their names must not change.
bike_ts <- ts(bike_data$cnt, frequency = 365)

# Remove outliers and interpolate missing values before smoothing
bike_clean <- tsclean(bike_ts)

# 10-day simple moving average to smooth out short-term fluctuations
sma_10 <- SMA(bike_clean, n = 10)
plot(sma_10, main = "Simple Moving Average (Order 10)")

# Holt-Winters with beta = FALSE and gamma = FALSE reduces to simple
# exponential smoothing (level only, no trend or seasonal terms)
ses_fit <- HoltWinters(bike_clean, beta = FALSE, gamma = FALSE)
plot(ses_fit)

Task Four: Decompose and assess the stationarity of the time series data

# Classical decomposition into trend, seasonal and random components
# (uses the frequency = 365 set when bike_ts was created)
decomp <- decompose(bike_ts)
plot(decomp)

# Augmented Dickey-Fuller Test for stationarity
# H0: the series has a unit root (non-stationary); reject when p < 0.05
adf.test(bike_ts)
## 
##  Augmented Dickey-Fuller Test
## 
## data:  bike_ts
## Dickey-Fuller = -1.6351, Lag order = 9, p-value = 0.7327
## alternative hypothesis: stationary
# p = 0.7327: cannot reject H0 — the raw series is non-stationary
# ACF and PACF plots of the raw series to inspect autocorrelation structure
acf(bike_ts)

pacf(bike_ts)

# Differencing (if necessary): first-order differencing to remove the trend
bike_diff <- diff(bike_ts)
plot(bike_diff)

adf.test(bike_diff)
## Warning in adf.test(bike_diff): p-value smaller than printed p-value
## 
##  Augmented Dickey-Fuller Test
## 
## data:  bike_diff
## Dickey-Fuller = -13.798, Lag order = 8, p-value = 0.01
## alternative hypothesis: stationary
# p <= 0.01: H0 rejected — the differenced series is stationary, so
# d = 1 is sufficient for ARIMA modeling

Task Five: Fit and forecast time series data using ARIMA models

# Auto ARIMA: lets auto.arima() search for the best (p,d,q)(P,D,Q) orders
# on the cleaned series. It selected ARIMA(1,0,3)(0,1,0)[365] with drift,
# i.e. a seasonal difference at the yearly period plus drift.
auto_fit <- auto.arima(bike_clean)
summary(auto_fit)
## Series: bike_clean 
## ARIMA(1,0,3)(0,1,0)[365] with drift 
## 
## Coefficients:
##          ar1      ma1      ma2      ma3   drift
##       0.9683  -0.5912  -0.1279  -0.0937  5.7116
## s.e.  0.0224   0.0571   0.0617   0.0576  0.8318
## 
## sigma^2 = 986021:  log likelihood = -3042.81
## AIC=6097.63   AICc=6097.86   BIC=6121.05
## 
## Training set error measures:
##                    ME     RMSE      MAE       MPE     MAPE      MASE
## Training set 5.853004 697.8113 385.8648 -2.699883 9.189324 0.1694626
##                      ACF1
## Training set -0.003587809
# Manual ARIMA model example: a non-seasonal ARIMA(1,1,1) for comparison.
# Its AIC (11862.37) is far worse than auto.arima's (6097.63), reflecting
# the ignored yearly seasonality.
manual_fit <- arima(bike_clean, order = c(1,1,1))
summary(manual_fit)
## 
## Call:
## arima(x = bike_clean, order = c(1, 1, 1))
## 
## Coefficients:
##          ar1      ma1
##       0.3692  -0.8751
## s.e.  0.0437   0.0213
## 
## sigma^2 estimated as 661725:  log likelihood = -5928.18,  aic = 11862.37
## 
## Training set error measures:
##                    ME     RMSE      MAE       MPE     MAPE     MASE       ACF1
## Training set 11.13247 812.9082 588.1202 -6.429274 19.42943 0.894064 0.01109983
# Residual diagnostics: Shapiro-Wilk tests residual normality.
# p < 2.2e-16 rejects normality, so prediction intervals based on a
# Gaussian assumption should be treated with caution.
shapiro.test(residuals(auto_fit))
## 
##  Shapiro-Wilk normality test
## 
## data:  residuals(auto_fit)
## W = 0.83801, p-value < 2.2e-16
# ACF/PACF of residuals to check for remaining autocorrelation
acf(residuals(auto_fit))

pacf(residuals(auto_fit))

# Forecast the next 25 values (days) from each fitted model
forecast_auto <- forecast(auto_fit, h = 25)
plot(forecast_auto, main = "Forecast with Auto ARIMA")

forecast_manual <- forecast(manual_fit, h = 25)
plot(forecast_manual, main = "Forecast with Manual ARIMA")

Task Six: Findings and Conclusions